library(readr) #to load csv data.
library(dplyr) #data manipulation
library(ggplot2)
library(plotly)
library(DataExplorer)
library(naniar)
library(broom)
library(DT)
# Load the housing challenge dataset from the project's data directory.
housing_data <- read_csv(file = "data/challenge 1 dataset (housing).csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_double(),
## learner_id = col_double(),
## program_code = col_character(),
## variation_code = col_character(),
## message_in = col_character(),
## message_out = col_character(),
## created_at = col_datetime(format = ""),
## user_response = col_character()
## )
Introduce the data
# One-row overview of the dataset: row/column counts, discrete vs continuous
# columns, missing-value totals, complete rows and memory usage.
introduce(housing_data)
## # A tibble: 1 x 9
## rows columns discrete_columns continuous_colu~ all_missing_col~
## <int> <int> <int> <int> <int>
## 1 422868 8 6 2 0
## # ... with 4 more variables: total_missing_values <int>,
## # complete_rows <int>, total_observations <int>, memory_usage <dbl>
Plot the data introduction
# Plot the dataset introduction for housing_data.
plot_intro(housing_data)
Look at the columns that have missing values
# Per-column missingness: number (n_miss) and percentage (pct_miss) of missing
# values for every variable.
miss_var_summary(housing_data)
## # A tibble: 8 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 user_response 13593 3.21
## 2 message_in 1813 0.429
## 3 message_out 146 0.0345
## 4 X1 0 0
## 5 learner_id 0 0
## 6 program_code 0 0
## 7 variation_code 0 0
## 8 created_at 0 0
Plot the missing data
# Plot the missing data in housing_data.
plot_missing(housing_data)
Each column with missing data has fewer than 5% of its values missing (the highest is user_response at 3.21%). Since we have relatively many observations, we could simply drop the rows that contain missing values. However, I will retain these observations for the purposes of answering the questions.
Look at the internal structure
# Compact structure listing: each column's name, type and first few values.
glimpse(housing_data)
## Observations: 422,868
## Variables: 8
## $ X1 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ learner_id <dbl> 8, 8, 8, 8, 8, 11, 11, 11, 11, 11, 11, 11, 11, ...
## $ program_code <chr> "HFH", "HFH", "PL", "PL", "PL", "HFH", "HFH", "...
## $ variation_code <chr> "HFH", "HFH", "PL", "PL", "PL", "HFH", "HFH", "...
## $ message_in <chr> "hfh", "a", "1", "a", "a", "HFH", "A", "1", "A"...
## $ message_out <chr> "Housing is a basic need but not everyone can a...
## $ created_at <dttm> 2017-12-20 11:55:19, 2017-12-20 11:56:19, 2017...
## $ user_response <chr> "a", "1", "a", "a", "ACCESS|DIGI", "A", "1", "A...
# Total number of rows in the dataset.
# NOTE(review): this is a row count, not a count of distinct learner_id
# values -- confirm which of the two the question is asking for.
nrow(housing_data)
## [1] 422868
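422868 rows, i.e. interactions rather than learners: a learner_id appears once per interaction. The number of distinct learners is 15641 (756 + 14885, from the per-learner counts below), which n_distinct(housing_data$learner_id) returns directly.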
# Rank programs by number of rows and keep the top three.
# `name =` is spelled out: `tally(n = ...)` only works because `n` partially
# matches tally()'s `name` argument. count() groups, tallies and sorts in one
# step and returns the same ungrouped result.
# NOTE(review): this counts rows per program_code, not distinct learner_id
# values, despite the column name -- confirm which one the question asks for.
housing_data %>%
  count(program_code, name = "total_number_of_learners", sort = TRUE) %>%
  head(3)
## # A tibble: 3 x 2
## program_code total_number_of_learners
## <chr> <int>
## 1 PLB 123652
## 2 HFS 78505
## 3 SF 55654
The three most popular trainings, ranked by the number of rows recorded for each program_code, were:
program_code total_number_of_learners
PLB (Plan your money to build) 123652
HFS (Survey) 78505
SF (Select fundi) 55654
# Learner with the largest number of rows (interactions).
# `name =` is spelled out instead of relying on `n` partially matching
# tally()'s `name` argument.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions", sort = TRUE) %>%
  head(1)
## # A tibble: 1 x 2
## learner_id total_number_of_interactions
## <dbl> <int>
## 1 648424 1172
learner_id 648424 had the highest total_number_of_interactions: 1172.
# Number of learners with more than 100 interactions.
# `name =` is spelled out instead of relying on `n` partially matching
# tally()'s `name` argument.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions") %>%
  filter(total_number_of_interactions > 100) %>%
  nrow()
## [1] 756
756 learners had more than 100 interactions.
# Number of learners with 100 or fewer interactions.
# `name =` is spelled out instead of relying on `n` partially matching
# tally()'s `name` argument.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions") %>%
  filter(total_number_of_interactions <= 100) %>%
  nrow()
## [1] 14885
14885 learners had 100 or fewer interactions.
# Per-learner interaction counts, restricted to learners that had 100 or fewer
# interactions. `name =` is spelled out instead of relying on `n` partially
# matching tally()'s `name` argument.
interactions_count_by_id <- housing_data %>%
  count(learner_id, name = "interactions_count") %>%
  filter(interactions_count <= 100)

# Min / quartiles / median / mean / max of those per-learner counts.
summary(interactions_count_by_id$interactions_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 14.00 20.87 30.00 100.00
The mean (20.87) is greater than the median (14), which suggests that the interactions count is positively (right) skewed.
# Interactive 10-bin histogram of interactions per learner (learners with
# 100 or fewer interactions only).
ggplotly(
  ggplot(interactions_count_by_id, aes(x = interactions_count)) +
    geom_histogram(bins = 10)
)
# Day of the week with the most rows: count per weekday name, largest first,
# and keep the top entry.
# NOTE(review): weekdays() returns day names in the session locale.
housing_data %>%
  count(weekdays(created_at), sort = TRUE) %>%
  head(1)
## # A tibble: 1 x 2
## `weekdays(created_at)` n
## <chr> <int>
## 1 Wednesday 145590
Wednesday with 145590 interactions.
# Day of the week with the fewest rows: count per weekday name, smallest
# first, and keep the top entry.
# NOTE(review): weekdays() returns day names in the session locale.
housing_data %>%
  count(weekdays(created_at)) %>%
  arrange(n) %>%
  head(1)
## # A tibble: 1 x 2
## `weekdays(created_at)` n
## <chr> <int>
## 1 Monday 14096
Monday with 14096 interactions.